# -*- coding: utf-8 -*-
# @Time : 2021/3/9 15:35
# @Author : chao
# NOTE: the coding declaration must appear on line 1 or 2 to take effect
# (PEP 263); it was previously on line 3, where Python ignored it.

import pandas as pd
import numpy as np
import jieba
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer


def seg_sentence(sentence, stopwords):
    """Tokenize a sentence with jieba and drop stopwords.

    Requires the jieba package: pip install jieba

    Args:
        sentence: raw sentence (str); leading/trailing whitespace is stripped
            before segmentation.
        stopwords: collection of stopwords tested with ``in``; a set is
            preferred for O(1) membership, but any container works.

    Returns:
        str: the surviving tokens, each followed by a single space, so a
        non-empty result carries a trailing space (this matches the original
        output format consumed by CountVectorizer downstream).
    """
    kept = (
        word
        for word in jieba.cut(sentence.strip())
        if word not in stopwords and word != '\t'
    )
    # ''.join builds the result in linear time; the previous `outstr += word`
    # loop was quadratic in the worst case.
    return ''.join(word + ' ' for word in kept)


# Path to the stopword dictionary (one stopword per line, UTF-8).
swPath = r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\词典\stopword停用词.txt'
# Build the stopword set.  A `with` block guarantees the handle is closed
# even if reading fails (the old code used a bare open/close pair), and a
# set both de-duplicates in one step (replacing the list(set(...)) round
# trip) and gives O(1) membership tests inside seg_sentence.  The old
# `filter(lambda x: x != '', lines)` was a no-op: readlines() never yields
# an empty string.
with open(swPath, 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
# A lone space must also be filtered out of the segmented text.
stopwords.add(' ')

# Load the preprocessed review corpus (one review per line) and segment
# every review, stripping stopwords, before vectorization.
# NOTE(review): 'ANSI' is a Windows-only alias for the 'mbcs' codec (the
# active ANSI code page — cp936/GBK on a Chinese-locale machine).  This
# makes the script non-portable; on Linux/macOS use 'gbk' explicitly.
with open(r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\数据\预处理后数据\zong_data.txt', encoding='ANSI') as f:
    # Iterate the file directly instead of materializing readlines() first;
    # a comprehension replaces the map+lambda for clarity.
    seg_text = [seg_sentence(review, stopwords) for review in f]

# Count unigrams through trigrams, keeping only the 50 most frequent n-grams.
ngram_vectorizer = CountVectorizer(ngram_range=(1, 3), max_features=50)
count = ngram_vectorizer.fit_transform(seg_text).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer the new API
# and fall back so the script still runs on older installations.
try:
    featureName = list(ngram_vectorizer.get_feature_names_out())
except AttributeError:
    featureName = ngram_vectorizer.get_feature_names()
# Join multi-word n-grams with underscores so each feature is one token.
featureName = [name.replace(' ', '_') for name in featureName]
seg_text = []  # release the segmented corpus; only the count matrix is needed now

# Convert the raw term counts into TF-IDF weights.
tfidf_transformer = TfidfTransformer()
word_vec = tfidf_transformer.fit_transform(count).toarray()

# One column per feature, one row per review.  The deprecated np.matrix +
# vstack approach coerced every float to a string; a DataFrame with the
# feature names as the header row writes the same CSV layout (first row =
# names, following rows = TF-IDF values) without that coercion.
df = pd.DataFrame(word_vec, columns=featureName)

# Persist the feature names and TF-IDF matrix to CSV.
df.to_csv(r'C:\Users\鲍超\Desktop\小论文——基于在线评论的物流客户满意度研究\代码\特征提取\特征提取数据\zong_tfidf.csv',
          index=False, header=True, encoding='gbk')

# NOTE(review): the words below appear to be feature names recorded from a
# previous run — roughly: price, information, company, parcel, seller,
# address, customer, customer service, express item, attitude, service,
# service attitude, logistics, phone, branch/outlet, speed.  Verify against
# the generated CSV.
# 价格 信息 公司 包裹 卖家 地址 客户 客服 快件 态度 服务 服务态度 物流 电话
# 网点 速度